In [17]:
import pandas as pd
import numpy as np
import re

from dateutil.parser import parse

In [18]:
data = pd.read_csv('Signs.csv')
data.head()


Out[18]:
longtitude latitude OBJECTID SG_KEY_BOR SG_ORDER_N SG_SEQNO_N SG_MUTCD_C SR_DIST SG_SIGN_FC SG_ARROW_D x y SIGNDESC1
0 -73.922335 40.836256 11919717 B P-132428 3 SP-287B 45 NaN NaN 1005740.86711 243957.356623 NO STANDING 10AM-6PM MON THRU FRI
1 -73.922335 40.836256 11919718 B P-132428 4 SP-672G 45 NaN NaN 1005740.86711 243957.356623 EXCEPT
2 -73.922335 40.836256 11919719 B P-132428 5 SP-579G 45 NaN NaN 1005740.86711 243957.356623 AMBULETTE
3 -73.922330 40.836352 11919720 B P-132428 6 SP-287BA 80 NaN S 1005742.32839 243992.461212 NO STANDING 10AM-6PM MON THRU FRI (SINGLE ARROW)
4 -73.922330 40.836352 11919721 B P-132428 7 SP-672G 80 NaN NaN 1005742.32839 243992.461212 EXCEPT

In [19]:
data['SIGNDESC1'].head()


Out[19]:
0                   NO STANDING 10AM-6PM MON THRU FRI
1                                              EXCEPT
2                                           AMBULETTE
3    NO STANDING 10AM-6PM MON THRU FRI (SINGLE ARROW)
4                                              EXCEPT
Name: SIGNDESC1, dtype: object

In [45]:
# https://regex101.com/r/fC0lI5/10
p = re.compile(r'(NOON|MIDNIGHT.*|[01]?[0-9]+:?[0-9]*(?:[AP]MM?)?)\s*(?:-|TO|\s)\s*(NOON|MIDNIGHT|[01]?[0-9]+:?[0-9]*(?:[AP]MM?)?)')

def extract_time(desc, group=1):
    m = p.search(desc)
    if m:
        time = m.group(group)
        if time == 'MIDNIGHT':
            return '12AM'
        elif time == 'MIDNIGHT TO':
            return '12AM'
        elif time == 'NOON':
            return '12PM'
        elif 'MM' in time:
            match_amm_or_pmm = re.compile(r'([0-9])*?([AP]MM?)')
            matched_time = match_amm_or_pmm.search(time)
            if matched_time:
                meridiem = 'AM' if matched_time.group(2) == 'AMM' else 'PM'
                oclock = matched_time.group(1)
                return '{0}{1}'.format(oclock, meridiem)
                
        return m.group(group)
    return np.nan

def from_time(signdesc):
    f_time = extract_time(signdesc, group=1)
    if f_time or not np.nan:
        return parse(f_time).strftime('%I:%M%p')
    return None

def to_time(signdesc):
    t_time = extract_time(signdesc, group=2)
    if t_time or not np.nan:
        return parse(t_time).strftime('%I:%M%p')
    return None

Special Cases

assert extract_time('1 HR MUNI-METER PARKING 10AM-7PM MON THRU FRI 8AM-7PM SATURDAY W/ SINGLE ARROW') == '' NO PARKING (SANITATION BROOM SYMBOL) 11:30AM TO 1 PM FRIW/ SINGLE ARROW

check if 2 timings is the maximum amount


In [46]:
row = 'NO PARKING (SANITATION BROOM SYMBOL) 7AM-7:30AM EXCEPT SUNDAY'
assert from_time(row) == '07:00AM'
assert to_time(row) == '07:30AM'

special_case1 = 'NO PARKING (SANITATION BROOM SYMBOL) 11:30AM TO 1PM THURS'
assert from_time(special_case1) == '11:30AM'
assert to_time(special_case1) == '01:00PM'

special_case2 = 'NO PARKING (SANITATION BROOM SYMBOL) MOON & STARS (SYMBOLS) TUESDAY FRIDAY MIDNIGHT-3AM'
assert from_time(special_case2) == '12:00AM'
assert to_time(special_case2) == '03:00AM'

special_case3 = 'TRUCK (SYMBOL) TRUCK LOADING ONLY MONDAY-FRIDAY NOON-2PM'
assert from_time(special_case3) == '12:00PM'
assert to_time(special_case3) == '02:00PM'

special_case4 = 'NIGHT REGULATION (MOON & STARS SYMBOLS) NO PARKING (SANITATION BROOM SYMBOL) MIDNIGHT TO-3AM WED & SAT'
assert from_time(special_case4) == '12:00AM'
assert to_time(special_case4) == '03:00AM'

special_case5 = 'NO PARKING (SANITATION BROOM SYMBOL)8AM 11AM TUES & THURS'
assert from_time(special_case5) == '08:00AM'
assert to_time(special_case5) == '11:00AM'

special_case6 = 'NO PARKING (SANITATION BROOM SYMBOL) MONDAY THURSDAY 7AMM-7:30AM'
assert from_time(special_case6) == '07:00AM'
assert to_time(special_case6) == '07:30AM'

In [50]:
def filter_from_time(row):
    if not pd.isnull(row['SIGNDESC1']):
        print(from_time(row['SIGNDESC1']))
        return from_time(row['SIGNDESC1'])
    return np.nan

In [51]:
def filter_to_time(row):
    if not pd.isnull(row['SIGNDESC1']):
        return to_time(row['SIGNDESC1'])
    return np.nan

In [52]:
data['FROM_TIME'] = data.apply(filter_from_time, axis=1)


10:00AM
10:00AM
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-52-92f28f1e659b> in <module>()
----> 1 data['FROM_TIME'] = data.apply(filter_from_time, axis=1)

/Users/ncharass/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   3912                     if reduce is None:
   3913                         reduce = True
-> 3914                     return self._apply_standard(f, axis, reduce=reduce)
   3915             else:
   3916                 return self._apply_broadcast(f, axis)

/Users/ncharass/anaconda/lib/python3.5/site-packages/pandas/core/frame.py in _apply_standard(self, func, axis, ignore_failures, reduce)
   4004             try:
   4005                 for i, v in enumerate(series_gen):
-> 4006                     results[i] = func(v)
   4007                     keys.append(v.name)
   4008             except Exception as e:

<ipython-input-50-535f2b1276f1> in filter_from_time(row)
      1 def filter_from_time(row):
      2     if not pd.isnull(row['SIGNDESC1']):
----> 3         print(from_time(row['SIGNDESC1']))
      4         return from_time(row['SIGNDESC1'])
      5     return np.nan

<ipython-input-45-8956e79faa0f> in from_time(signdesc)
     26     f_time = extract_time(signdesc, group=1)
     27     if f_time or not np.nan:
---> 28         return parse(f_time).strftime('%I:%M%p')
     29     return None
     30 def to_time(signdesc):

/Users/ncharass/anaconda/lib/python3.5/site-packages/dateutil/parser.py in parse(timestr, parserinfo, **kwargs)
   1006         return parser(parserinfo).parse(timestr, **kwargs)
   1007     else:
-> 1008         return DEFAULTPARSER.parse(timestr, **kwargs)
   1009 
   1010 

/Users/ncharass/anaconda/lib/python3.5/site-packages/dateutil/parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    390             res, skipped_tokens = self._parse(timestr, **kwargs)
    391         else:
--> 392             res = self._parse(timestr, **kwargs)
    393 
    394         if res is None:

/Users/ncharass/anaconda/lib/python3.5/site-packages/dateutil/parser.py in _parse(self, timestr, dayfirst, yearfirst, fuzzy, fuzzy_with_tokens)
    490 
    491         res = self._result()
--> 492         l = _timelex.split(timestr)         # Splits the timestr into tokens
    493 
    494         # keep up with the last token skipped so we can recombine

/Users/ncharass/anaconda/lib/python3.5/site-packages/dateutil/parser.py in split(cls, s)
    172 
    173     def split(cls, s):
--> 174         return list(cls(s))
    175     split = classmethod(split)
    176 

/Users/ncharass/anaconda/lib/python3.5/site-packages/dateutil/parser.py in __next__(self)
    162 
    163     def __next__(self):
--> 164         token = self.get_token()
    165         if token is None:
    166             raise StopIteration

/Users/ncharass/anaconda/lib/python3.5/site-packages/dateutil/parser.py in get_token(self)
     80                 nextchar = self.charstack.pop(0)
     81             else:
---> 82                 nextchar = self.instream.read(1)
     83                 while nextchar == '\x00':
     84                     nextchar = self.instream.read(1)

AttributeError: ("'float' object has no attribute 'read'", 'occurred at index 1')

In [ ]:
data['TO_TIME'] = data.apply(filter_to_time, axis=1)

In [ ]:
data[['SIGNDESC1', 'FROM_TIME', 'TO_TIME']].head(10)

Find out if any rows has NaN

Want to find out if any rows has NaN from from_time and to_time but has timing in SIGNDESC1


In [ ]:
rows_with_AM_PM_but_time_NaN = data[(data['FROM_TIME'].isnull() | data['FROM_TIME'].isnull()) & (data['SIGNDESC1'].str.contains('[0-9]+(?:[AP]M)'))]

In [ ]:
len(rows_with_AM_PM_but_time_NaN)

In [ ]:
rows_with_AM_PM_but_time_NaN[['SIGNDESC1', 'FROM_TIME', 'TO_TIME']]

In [ ]:
data.iloc[180670, data.columns.get_loc('SIGNDESC1')]

In [ ]:
data.iloc[180670, data.columns.get_loc('FROM_TIME')] = '9AM'
data.iloc[180670, data.columns.get_loc('TO_TIME')] = '4AM'

In [ ]:
data.iloc[212089, data.columns.get_loc('SIGNDESC1')]

In [ ]:
data.iloc[212089, data.columns.get_loc('FROM_TIME')] = '10AM'
data.iloc[212089, data.columns.get_loc('TO_TIME')] = '11:30AM'

In [ ]:
data.iloc[258938, data.columns.get_loc('SIGNDESC1')]

In [ ]:
data.iloc[258938, data.columns.get_loc('FROM_TIME')] = '10AM'
data.iloc[258938, data.columns.get_loc('TO_TIME')] = '11:30AM'

In [ ]:
data.iloc[258942, data.columns.get_loc('SIGNDESC1')]

In [ ]:
data.iloc[258942, data.columns.get_loc('FROM_TIME')] = '10AM'
data.iloc[258942, data.columns.get_loc('TO_TIME')] = '11:30AM'

In [ ]:
data.iloc[258944, data.columns.get_loc('SIGNDESC1')]

In [ ]:
data.iloc[258944, data.columns.get_loc('FROM_TIME')] = '10AM'
data.iloc[258944, data.columns.get_loc('TO_TIME')] = '11:30AM'

In [ ]:
data.iloc[283262, data.columns.get_loc('SIGNDESC1')]

In [ ]:
data.iloc[283262, data.columns.get_loc('FROM_TIME')] = '6AM'
data.iloc[283262, data.columns.get_loc('TO_TIME')] = '7:30AM'

Confirm that every row has from_time and to_time


In [ ]:
rows_with_AM_PM_but_time_NaN = data[(data['FROM_TIME'].isnull() | data['FROM_TIME'].isnull()) & (data['SIGNDESC1'].str.contains('[0-9]+(?:[AP]M)'))]
len(rows_with_AM_PM_but_time_NaN)

In [ ]:
data[['SIGNDESC1', 'FROM_TIME', 'TO_TIME']]

Day of the week


In [ ]:
data['SIGNDESC1'].head(20)

In [ ]:
#https://regex101.com/r/fO4zL8/3
regex_to_extract_days_idv_days = r'\b((?:(?:MON|MONDAY|TUES|TUESDAY|WED|WEDNESDAY|THURS|THURSDAY|FRI|FRIDAY|SAT|SATURDAY|SUN|SUNDAY)\s*)+)(?=\s|$)'
regex_to_extract_days_with_range = r'(MON|TUES|WED|THURS|FRI|SAT|SUN)\s(THRU|\&)\s(MON|TUES|WED|THURS|FRI|SAT|SUN)'

def extract_day(signdesc):
    days = ['MON', 'TUES', 'WED', 'THURS', 'FRI', 'SAT', 'SUN']
    p_idv_days = re.compile(regex_to_extract_days_idv_days)
    m_idv_days = p_idv_days.search(signdesc)
    
    p_range_days = re.compile(regex_to_extract_days_with_range)
    m_range_days = p_range_days.search(signdesc)
    
    if 'EXCEPT SUN' in signdesc:
        return ', '.join(days[:6])
    
    if 'INCLUDING SUNDAY' in signdesc:
        return ', '.join(days)
    
    if 'FRIW/' in signdesc:
        return ', '.join(['FRI'])
    
    if ('THRU' in signdesc) and m_range_days:
        from_day = m_range_days.group(1)
        to_day = m_range_days.group(3)

        idx_frm_d = days.index(from_day)
        idx_to_d = days.index(to_day)
        return ', '.join([days[n] for n in range(idx_frm_d, idx_to_d + 1)])
    
    if ('&' in signdesc) and m_range_days:
        from_day = m_range_days.group(1)
        to_day = m_range_days.group(3)
        
        return ', '.join([from_day, to_day])
    
    if m_idv_days:
        days = m_idv_days.group(1)
        d = []
        for day in days.split(' '):
            if len(day) > 3:
                if day in ['MONDAY', 'WEDNESDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']:
                    d.append(day[:3])
                if day in ['TUESDAY']:
                    d.append(day[:4])
                if day in ['THURSDAY']:
                    d.append(day[:5])
            else:
                d.append(day)
                    
        return ', '.join(d)
            
    return np.nan

In [ ]:
def filter_days(row):
    if not pd.isnull(row['SIGNDESC1']):
        return extract_day(row['SIGNDESC1'])
    return np.nan

In [ ]:
assert extract_day('NO STANDING 11AM-7AM MON SAT') == "MON, SAT"
assert extract_day('NO STANDING MON FRI 7AM-9AM') == "MON, FRI"
assert extract_day('2 HOUR PARKING 9AM-5PM MON THRU SAT') == "MON, TUES, WED, THURS, FRI, SAT"
assert extract_day('1 HOUR PARKING 8AM-7PM EXCEPT SUNDAY') == "MON, TUES, WED, THURS, FRI, SAT"
assert extract_day('NO PARKING 10PM-8AM INCLUDING SUNDAY') == "MON, TUES, WED, THURS, FRI, SAT, SUN"
assert extract_day('NO PARKING (SANITATION BROOM SYMBOL) MONDAY THURSDAY 9:30AM-11AM') == "MON, THURS"
assert extract_day('NO PARKING (SANITATION BROOM SYMBOL) 11:30AM TO 1 PM FRIW/ SINGLE ARROW') == "FRI"
assert extract_day('NO PARKING (SANITATION BROOM SYMBOL) 8-9:30AM TUES & FRI') == "TUES, FRI"
assert extract_day('NO PARKING (SANITATION BROOM SYMBOL) TUESDAY FRIDAY 11AM-12:30PM') == "TUES, FRI"

In [ ]:
data['DAYS'] = data.apply(filter_days, axis=1)

In [ ]:
rows_with_days_but_DAYS_NAN = data[data['DAYS'].isnull() & data['SIGNDESC1'].str.contains('\sMON|\sTUES|\sWED|\sTHURS|\sFRI|\sSAT|\sSUN')]

In [ ]:
rows_with_days_but_DAYS_NAN[['SIGNDESC1', 'DAYS']]

In [ ]:
data.iloc[308838, data.columns.get_loc('SIGNDESC1')]

In [ ]:
data.head()

Save to CSV


In [ ]:
data.to_csv('Processed_Signs.csv', index=False)

In [ ]: